Imported Data

# Load the scraped Instagram post data and print a column-by-column overview.
insta_data <- read_csv(file = "instagram_data.csv")
insta_data %>% glimpse()
## Rows: 11,692
## Columns: 14
## $ owner_id        <chr> "36063641", "36063641", "36063641", "36063641", "36063…
## $ owner_username  <chr> "christendominique", "christendominique", "christendom…
## $ shortcode       <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33TadDM…
## $ is_video        <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ caption         <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta3 X …
## $ comments        <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884, 211…
## $ likes           <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287, 74…
## $ created_at      <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 170871…
## $ location        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl        <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.30808-6…
## $ multiple_images <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ username        <chr> "christendominique", "christendominique", "christendom…
## $ followers       <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 2144626, …
## $ following       <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, …

Mutated columns

Created new columns with calculated values

# Derive the per-post columns used by the analyses below:
#   engagement          - (likes + comments) as a percentage of followers,
#                         rounded to 2 decimals
#   follower_quantile   - quartile of `followers` (1 = lowest, 4 = highest)
#   engagement_quantile - quartile of `engagement`
#   post_timestamp      - `created_at` (Unix epoch seconds) as a date-time;
#                         as_datetime() returns UTC, not the poster's local time
#   post_time           - posting time rounded to the nearest hour, as "HH:MM"
#   caption_length      - number of space-separated tokens in the caption
new_data <- insta_data %>%
  mutate(
    engagement = round((likes + comments) / followers * 100, digits = 2),
    follower_quantile = ntile(followers, 4),
    engagement_quantile = ntile(engagement, 4),
    post_timestamp = as_datetime(created_at),
    post_time = format(
      round(post_timestamp, units = "hours"),
      format = "%H:%M"
    ),
    # strsplit(NA, " ") returns a length-1 NA, so a missing caption used to be
    # counted as one word. Count it as zero words instead.
    # NOTE(review): assumes an NA caption means "no caption text" (read_csv
    # reads empty fields as NA by default) - confirm against the data source.
    caption_length = if_else(
      is.na(caption),
      0L,
      lengths(strsplit(caption, " "))
    )
  )

Mutated data glimpse

# Print a column-by-column overview of the data with the derived columns.
new_data %>% glimpse()
## Rows: 11,692
## Columns: 20
## $ owner_id            <chr> "36063641", "36063641", "36063641", "36063641", "3…
## $ owner_username      <chr> "christendominique", "christendominique", "christe…
## $ shortcode           <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33T…
## $ is_video            <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ caption             <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta…
## $ comments            <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884,…
## $ likes               <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287…
## $ created_at          <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 17…
## $ location            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl            <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.308…
## $ multiple_images     <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ username            <chr> "christendominique", "christendominique", "christe…
## $ followers           <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 21446…
## $ following           <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 10…
## $ engagement          <dbl> 0.78, 0.44, 0.52, 0.34, 0.81, 0.46, 2.02, 0.21, 0.…
## $ follower_quantile   <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1,…
## $ engagement_quantile <int> 3, 2, 3, 2, 3, 2, 4, 2, 2, 4, 2, 2, 1, 3, 4, 2, 3,…
## $ post_timestamp      <dttm> 2024-03-01 20:59:18, 2024-02-29 21:10:48, 2024-02…
## $ post_time           <chr> "21:00", "21:00", "21:00", "20:00", "20:00", "20:0…
## $ caption_length      <int> 12, 34, 81, 57, 17, 66, 50, 17, 8, 53, 17, 20, 90,…

Account Follower Distribution

Insights on the account follower distribution, split into quartiles: 1 is the lowest-follower quartile and 4 is the highest.

# Mean follower count per follower quartile (1 = lowest, 4 = highest).
# Posts with a missing follower count have no quartile; they are dropped so
# the summary does not end with an uninformative NA / "NA" row.
new_data %>%
  filter(!is.na(followers)) %>%
  group_by(follower_quantile) %>%
  summarise(
    follower_mean = format(round(mean(followers), 0), big.mark = ",")
  )
## # A tibble: 4 × 2
##   follower_quantile follower_mean
##               <int> <chr>        
## 1                 1 108,262      
## 2                 2 342,149      
## 3                 3 834,535      
## 4                 4 8,559,178    

Post time vs. Engagement

# Mean engagement and number of posts for each (rounded) posting hour.
# The previous filter, `engagement != is.na(engagement)`, compared engagement
# with FALSE/TRUE (0/1): it removed NAs only by accident and also discarded
# every post whose engagement is exactly 0. Test for missing values directly.
new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(mean(engagement), n())
## # A tibble: 24 × 3
##    post_time `mean(engagement)` `n()`
##    <chr>                  <dbl> <int>
##  1 00:00                   2.08   261
##  2 01:00                   1.88   267
##  3 02:00                   1.71   238
##  4 03:00                   2.99   178
##  5 04:00                   2.37   138
##  6 05:00                   3.34   125
##  7 06:00                   2.38   145
##  8 07:00                   1.59   185
##  9 08:00                   3.81   223
## 10 09:00                   1.93   293
## # ℹ 14 more rows

When do posts get the most engagement?

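We see the most engagement for posts made around 5 AM, 8 AM, 12 PM, 1 PM, 4 PM and 5 PM, i.e. during peak times of the day. The chart shows average engagement % by posting time, rounded to the nearest hour. Note that `post_time` is derived with `as_datetime()`, which returns UTC, so these hours are UTC rather than each account's local time unless the timestamps are converted to a local time zone first.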

# Bar chart of average engagement % per posting hour, rendered interactively
# with plotly. The six hours named in `values` are highlighted in tomato; every
# other hour has no entry in the named vector and falls back to `na.value`.
time_eng <- new_data %>%
  # Keep posts with a known engagement value. The old test
  # `engagement != is.na(engagement)` also dropped posts with engagement == 0.
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(eng_mean = round(mean(engagement), 1)) %>%
  ggplot(aes(x = eng_mean, y = post_time, fill = as.factor(post_time))) +
  # geom_col() always uses the identity stat; passing `stat` only triggered an
  # "unknown parameter" warning.
  geom_col() +
  labs(y = "Posting Time", x = "Avg Engagement %") +
  scale_fill_manual(
    values = c(
      "05:00" = "tomato",
      "08:00" = "tomato",
      "12:00" = "tomato",
      "13:00" = "tomato",
      "16:00" = "tomato",
      "17:00" = "tomato"
    ),
    na.value = "grey50",
    # `guide = FALSE` is deprecated; "none" is the supported way to hide it.
    guide = "none"
  )

ggplotly(time_eng)

## Relationship between caption lengths and engagement

Highest engagement posts include captions with lengths x & y

# Average engagement by caption length (in words), bucketed in steps of 50.
caption_bucket_labels <- c(
  "<50", "50-100", "100-150", "150-200",
  "200-250", "250-300", "300-350", "350+"
)

new_data %>%
  # Keep posts with a known engagement value. The old test
  # `engagement != is.na(engagement)` also dropped posts with engagement == 0.
  filter(!is.na(engagement)) %>%
  mutate(
    # Left-closed intervals [lower, upper). The previous case_when() used
    # `< 350` and `> 350`, so a caption of exactly 350 words matched no bucket
    # and became NA; here it falls into "350+".
    caption_bucket = cut(
      caption_length,
      breaks = c(-Inf, seq(50, 350, by = 50), Inf),
      labels = caption_bucket_labels,
      right = FALSE
    )
  ) %>%
  group_by(caption_bucket) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = caption_bucket, y = avg_eng)) +
  geom_point(size = 5) +
  labs(x = "Caption Length", y = "Average Engagement") +
  # Fix the axis order and keep empty buckets visible.
  scale_x_discrete(limits = caption_bucket_labels)

## Pictures/Videos/Carousel vs. Engagement

Do pictures or videos get more engagement? Clearly, single images get more engagement and carousels get less.

# Average engagement by content type. A post with multiple images is a
# "Carousel" regardless of `is_video`; otherwise it is a "Video" or "Picture".
new_data %>%
  # Keep posts with a known engagement value. The old test
  # `engagement != is.na(engagement)` also dropped posts with engagement == 0.
  filter(!is.na(engagement)) %>%
  mutate(
    type = case_when(
      is_video & !multiple_images ~ "Video",
      !is_video & !multiple_images ~ "Picture",
      multiple_images ~ "Carousel"
    )
  ) %>%
  group_by(type) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = type, y = avg_eng)) +
  geom_col() +
  labs(x = "Content Type", y = "Average Engagement")

We see pictures get more engagement

# Scatter plot of likes against comments for every post, on log10 axes,
# coloured by the `is_video` flag (FALSE shown as "Picture", TRUE as "Video").
ggplot(
  data = new_data,
  mapping = aes(x = likes, y = comments, group = is_video, color = is_video)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_manual(
    name = "Type",
    labels = c("Picture", "Video"),
    values = c("blue", "red")
  )